#################################
# Media Measurement Matters     #
# Replication Code              #
# Appendix Q: Weighted Results  #
#################################

# The following file contains code for replicating the figures and analyses in
# Appendix Q. This section replicates all of the main results in the paper using
# sampling weights.

# Set-Up ----

# If desired, set up path into which to save plots and tables
plot_path <- NULL
table_path <- NULL

# Load packages
library(tidyverse)
library(ggridges)
library(ggpubr)
library(gtools)
library(estimatr)
library(anesrake)
library(survey)
library(srvyr)
library(dineq)

# Set up helper operations
`%notin%` <- Negate(`%in%`)

# Set up colors
red_mit = '#A31F34'
red_light = '#A9606C'
blue_mit = '#315485'
grey_light= '#C2C0BF'
grey_dark = '#8A8B8C'
black = '#353132'

# Source helper functions
source("helper_functions.R")

# Set seed
set.seed(531)

# > Read in data ----

# Read in survey data
srvy <- read_rds("data/survey_data_cleaned.rds")

# Read in web data
web <- read_rds("data/web_use.rds")

# Filter out portal sites
web_noport <- web %>% 
  filter(domain_recode %notin% c("aol.com", "msn.com", "google.com"))

# Construct survey weights ----

# > Specify marginal population distributions ----

# All quantities are based on the 2017 American Community Survey 1-year estimates,
# except for partisanship, which is based on Pew Research Center study.

# Total population
total_denom <- 325719178

# Gender: from Age and Sex Table (2017 ACS 1-year estimates)
  # Categories: Male, Female
gender <- c(160402504, 165316674)/total_denom # Male, Female

# Age: from Age and Sex Table (2017 ACS 1-year estimates)
  # Categories: 18-24, 25-34, 35-44, 45-54, 55-64, 65+ (matching age_cat variable in srvy)

age_denom <- 252070495	 # Total count of 18 years and over

age_over19 <- c(21950055, # 20-24
                23085789, 21879946, # 25-34
                21373244,	19744661, # 35-44
                20948890, 21382065, # 45-54
                21765184, 20254592, # 55-64
                16926936, 12804940, 8854621, 5969742, 6259473) # 65+ 
age_1819 <- age_denom - sum(age_over19)

age_cat <- c(age_over19[1] + age_1819, # 18-24
             sum(age_over19[2:3]), # 25-34
             sum(age_over19[4:5]), # 35-44
             sum(age_over19[6:7]), # 45-54
             sum(age_over19[8:9]), # 55-64
             sum(age_over19[10:14])) # 65+

age <- age_cat/age_denom

# Race/ethnicity: from Hispanic or Latino Origin by Race Table (2017 ACS 1-Year Estimates)
  # Categories: White, Non-White
race_cat <- c(197285202, # White (non-Hispanic)
              40129593, # Black (non-Hispanic)
              58846134, # Hispanic 
              sum(c(833898, 7932565, 2145162, 17999846, 546778))) # Other race/ethnicity

race <- race_cat/total_denom
race_bin <- c(race_cat[1], sum(race_cat[2:4]))/total_denom # White vs. non-white

# Geographic region: from Total Population (2017 ACS 1-Year Estimates)
  # Categories: Northeast, Midwest, South, West

region_cat <- c(56470581, 68179351, 123658624, 77410622)

region <- region_cat/total_denom

# Party ID: based on Pew Research Center Party Identification Trends, 1992-2017 (2017 estimates)
  # Stable link: https://web.archive.org/web/20180428054630/https://www.people-press.org/2018/03/20/party-identification-trends-1992-2017/
  # Categories: Democrat, Republican, Independent (with leaners coded as partisans)

pid <- c(.5, .42, .08) 

# > Recode survey data to match categories ----

# Recode survey data to match the categories specified above (as integers)
srvy <- srvy %>% 
  mutate(age_cat_num = as.numeric(age_cat),
         race_num = case_when(race_white == 1 ~ 1,
                              race_white == 0 ~ 2),
         pid_num = case_when(pid == -1 ~ 1,
                             pid == 1 ~ 2,
                             pid == 0 ~ 3))

# Examine marginal distributions for each variable in our unweighted dataset
srvy_dems <- srvy %>% 
  select(gender, age_cat_num, race_num, region_num, pid_num)

srvy_dem_props <- map(names(srvy_dems), function(.x){
  round(prop.table(table(srvy_dems[,.x])), 3)
})
names(srvy_dem_props) <- names(srvy_dems)
srvy_dem_props

# > Estimate weights ----

# Set list of targets and match names to variables
targets <- list(gender, age, race_bin, region, pid)
names(targets) <- c("gender", "age_cat_num", "race_num", "region_num", "pid_num")

# Estimate weights using anesrake package
anesrakefinder(targets, srvy %>% as.data.frame(), choosemethod = "total")

output <- anesrake(targets, srvy %>% as.data.frame(), caseid = srvy$resp_id,
                   cap = 5, choosemethod = "total",
                   type = "pctlim", pctlim = .05 , nlim = 5,
                   iterate = TRUE , force1 = TRUE)

# Merge weights into dataset
weight_df <- data.frame(resp_id = output$caseid, rake_weight = output$weightvec)

srvy <- srvy %>% 
  left_join(weight_df, by = "resp_id")

# Check post-weight demographics
round(wpct(srvy$gender, weight = srvy$rake_weight), 3); round(gender, 3)
round(wpct(srvy$age_cat, weight = srvy$rake_weight), 3); round(age, 3)
round(wpct(srvy$race_white == 1, weight = srvy$rake_weight), 3); round(race_bin, 3)
round(wpct(srvy$region_num, weight = srvy$rake_weight), 3); round(region, 3)
round(wpct(srvy$pid_num, weight = srvy$rake_weight), 3); round(pid, 3)

# Merge weights into web data
web_noport <- web_noport %>% 
  left_join(weight_df, by = "resp_id")

# Descriptive results ----

# > Relative volume ----

# Figure Q1: plot the distributions of relative volume scores by stated media
# preferences, with and without incorporating sampling weights

q1_a <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), x_var = "news_consump",
                    x_label = "Proportion of Visits to News Domains\n(Unweighted)",
                    y_label = "Stated Media Preference", by = 0.25,
                    height = 1.85, x_limits = c(-0.1, 1.05)) # Unweighted

ggsave(q1_a, path = plot_path, filename = "fig_q1_a.pdf",
       height= 3, width = 4.5, dpi = 600)

q1_b <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), x_var = "news_consump",
                    x_label = "Proportion of Visits to News Domains\n(Weighted)",
                    y_label = "Stated Media Preference", by = 0.25,
                    height = 1.85, x_limits = c(-0.1, 1.05), 
                    weights = TRUE) # Weighted

ggsave(q1_b, path = plot_path, filename = "fig_q1_b.pdf",
       height= 3, width = 4.5, dpi = 600)

# > Relative slant ----

# Figure Q2: plot the distributions of relative slant scores (respondent-level) by stated media
# preferences, with and without incorporating sampling weights
q2_a <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), 
                    x_var = "score_noportals",
                    x_label = "Respondent Avg. Slant Score\n(Unweighted)",
                    y_label = "Stated Media Preference") # Unweighted

ggsave(q2_a, path = plot_path, filename = "fig_q2_a.pdf",
       height= 3, width = 4.5, dpi = 600)

q2_b <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), 
                    x_var = "score_noportals",
                    x_label = "Respondent Avg. Slant Score\n(Weighted)",
                    y_label = "Stated Media Preference", weights = TRUE) # Weighted

ggsave(q2_b, path = plot_path, filename = "fig_q2_b.pdf",
       height= 3, width = 4.5, dpi = 600)

# Figure Q3: plot the distributions of relative slant scores (visit-level) by stated media
# preferences, with and without incorporating sampling weights

q3_a <- visit_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"),
                    x_label = "Relative Slant of News Visits (Unweighted)",
                    y_label = "Stated Media Preference", df = web_noport, 
                    exemplars = c("cnn.com","news.yahoo.com","msnbc.com",
                                  "foxnews.com","nytimes.com"), 
                    x_limits = c(-1.1, 1.1)) # Unweighted

ggsave(q3_a, path = plot_path, filename = "fig_q3_a.pdf",
       height= 3.5, width = 8, dpi = 600)

q3_b <- visit_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"),
                    x_label = "Relative Slant of News Visits (Weighted)",
                    y_label = "Stated Media Preference", df = web_noport, 
                    exemplars = c("cnn.com","news.yahoo.com","msnbc.com",
                                  "foxnews.com","nytimes.com"), 
                    weights = TRUE, x_limits = c(-1.1, 1.1)) # Weighted

ggsave(q3_b, path = plot_path, filename = "fig_q3_b.pdf",
       height= 3.5, width = 8, dpi = 600)

# Experimental results ----

# Set theme for plotting
theme_set(theme_bw() + 
            theme(legend.position = "bottom",
                  plot.title = element_text(hjust = 0.5, face = "bold",size = 16),
                  plot.subtitle = element_text(hjust = 0.5, face = "italic", size = 12),
                  axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm"),
                                              face = "bold", size = 12, angle = 0, hjust = 0.5),
                  axis.title.y = element_text(margin = unit(c(0, 3, 0, 0), "mm"), 
                                              face = "bold", size = 12),
                  legend.title = element_text(face = "bold", hjust = 0.5, size = 12),
                  legend.text = element_text(hjust = 0.5, size = 10),
                  axis.text.y = element_text(size = 10, color = "black"),
                  axis.text.x = element_text(size = 10, color = "black"),
                  legend.box = "vertical",
                  legend.background = element_blank(),
                  legend.box.background = element_rect(colour = "black"),
                  text=element_text(colour=black, 
                                    size=15)))

# > Relative volume ----

# Construct weighted bins
weight_bins <- function(score_var, nbins){
  if(class(srvy[, score_var])[1] == "numeric"){
    vec <- srvy[, score_var]
  }else{
    vec <- srvy[, score_var] %>% pull()
  }
  out <- ntiles.wtd(vec, nbins, weights = srvy$rake_weight)
  
  return(out)
}

# Construct weighted terciles for relative volume
srvy$news_consump_bin3_wt <- weight_bins(score_var = "news_consump", nbins = 3)

# Create labels for plotting
consump_labels3_wt <- gen_ranges(bin_var = "news_consump_bin3_wt", 
                                 score_version = "news_consump",
                                 parentheses = TRUE)
consump_labels3_wt <- paste0(c("Low News\nConsumption\n", "Medium News\nConsumption\n", 
                               "High News\nConsumption\n"), 
                             consump_labels3_wt)

# Estimate relative volume results, incorporating sampling weights
consump_vsent_fx3_wt <- vsent_plot(var = "news_consump", nbins = 3, 
                                   labels = consump_labels3_wt, weights = TRUE)

# Figure Q4: persuasion results by relative volume preferences, incorporating sampling weights
(q4 <- ggplot(na.omit(consump_vsent_fx3_wt %>% filter(id != "Fox vs.\nMSNBC")), 
                                      aes(x=factor(val),
                                          col = factor(id, levels = c("Fox vs.\nEntertainment",
                                                                      "MSNBC vs.\nEntertainment")),
                                          shape = factor(id, levels = c("Fox vs.\nEntertainment",
                                                                        "MSNBC vs.\nEntertainment")))) +
    geom_hline(yintercept=0, col = "white") +
    geom_hline(yintercept=0, linetype="dashed", color = grey_dark) +
    geom_errorbar(aes(ymin=min_cilo90, ymax=max_cihi90),
                  width=0, lwd = 1, position = position_dodge(width = 0.5)) +
    geom_errorbar(aes(ymin=min_cilo, ymax=max_cihi),
                  width=0, position = position_dodge(width = 0.5)) +
    geom_point(aes(y=naive),
               position = position_dodge(width = 0.5),
               size = 2) +
    facet_wrap(~ outcome,nrow=1) +
    scale_x_discrete(labels = unique(consump_vsent_fx3_wt$bin)) + 
    xlab("Relative Volume of News vs. Non-News") +
    ylab("Average Treatment Effect of Partisan\nMedia vs. Entertainment (Weighted)") +
    scale_y_continuous(breaks=seq(-0.2,0.2,0.1),
                       labels=plot_labels()$att,
                       limits = c(-0.225, 0.225),
                       sec.axis = dup_axis(name="",
                                           breaks=seq(-0.2,0.2,0.1),
                                           labels = plot_labels()$share)) +
    scale_colour_manual("Comparison",values=c(red_mit, blue_mit)) +
    scale_shape_manual("Comparison",values=c(16, 17, 15)) +
    theme(axis.text.x = element_text(size = 10, angle = 0, hjust = 0.5, color = "black")))

ggsave(q4, path = plot_path, filename = "fig_q4.pdf", 
       dpi = 600, width=9, height=5.25)

# > Relative slant ----

# Construct labels for plotting
code_labels <- gen_ranges(bin_var = "score_code", score_version = "score_noportals",
                          parentheses = TRUE)
code_labels <- paste(c("More Liberal\nThan CNN\n", "Between CNN\nand Yahoo!\n", 
                       "More Conserv.\nThan Yahoo!\n"), 
                     code_labels, sep = "")

code_labels_hard <- gen_ranges(bin_var = "score_code_hard", score_version = "score_hard",
                               parentheses = TRUE)
code_labels_hard <- paste(c("More Liberal\nThan CNN\n", "Between CNN\nand Yahoo!\n", 
                            "More Conserv.\nThan Yahoo!\n"), 
                          code_labels_hard, sep = "")

# Estimate relative slant results, using all URLs and incorporating sampling weights
score_vsent_code_wt <- group_vsent_plot(var = "score_code", nbins = 3, 
                                        labels = code_labels, weights = TRUE)

# Estimate relative slant results, using only hard news URLs and incorporating sampling weights
score_vsent_hard_wt <- group_vsent_plot(var = "score_code_hard", nbins = 3, 
                                        labels = code_labels_hard, weights = TRUE)

# Figure Q5(a): persuasion results by relative slant preferences, incorporating sampling weights
# and measuring preferences using all URL visits
(q5_a <- ggplot(na.omit(score_vsent_code_wt %>% filter(id != "Fox vs.\nMSNBC")), 
                              aes(x=factor(val),
                                  col = factor(id, levels = c("Fox vs.\nEntertainment",
                                                              "MSNBC vs.\nEntertainment")),
                                  shape = factor(id, levels = c("Fox vs.\nEntertainment",
                                                                "MSNBC vs.\nEntertainment")))) +
    geom_hline(yintercept=0, col = "white") +
    geom_hline(yintercept=0, linetype="dashed", color = grey_dark) +
    geom_errorbar(aes(ymin=min_cilo90, ymax=max_cihi90),
                  width=0, lwd = 1, position = position_dodge(width = 0.5)) +
    geom_errorbar(aes(ymin=min_cilo, ymax=max_cihi),
                  width=0, position = position_dodge(width = 0.5)) +
    geom_point(aes(y=naive),
               position = position_dodge(width = 0.5),
               size = 2) +
    facet_wrap(~ outcome,nrow=1) +
    scale_x_discrete(labels = unique(score_vsent_code_wt$bin)) + 
    xlab("Relative Slant of News Consumption") +
    ylab("Average Treatment Effect of Partisan\nMedia vs. Entertainment (Weighted)") +
    scale_y_continuous(breaks=seq(-0.2,0.2,0.1),
                       labels=plot_labels()$att,
                       limits = c(-0.225, 0.225),
                       sec.axis = dup_axis(name="",
                                           breaks=seq(-0.2,0.2,0.1),
                                           labels = plot_labels()$share)) +
    scale_colour_manual("Comparison",values=c(red_mit, blue_mit)) +
    scale_shape_manual("Comparison",values=c(16, 17, 15)) +
    theme(axis.text.x = element_text(size = 10, hjust = 0.5, color = "black")))

ggsave(q5_a, path = plot_path, filename = "fig_q5_a.pdf", 
       dpi = 600, width=9, height=5.25)

# Figure Q5(b): persuasion results by relative slant preferences, incorporating sampling weights
# and measuring preferences using only hard news URLs
(q5_b <- ggplot(na.omit(score_vsent_hard_wt %>% filter(id != "Fox vs.\nMSNBC")), 
                aes(x=factor(val),
                    col = factor(id, levels = c("Fox vs.\nEntertainment",
                                                "MSNBC vs.\nEntertainment")),
                    shape = factor(id, levels = c("Fox vs.\nEntertainment",
                                                  "MSNBC vs.\nEntertainment")))) +
    geom_hline(yintercept=0, col = "white") +
    geom_hline(yintercept=0, linetype="dashed", color = grey_dark) +
    geom_errorbar(aes(ymin=min_cilo90, ymax=max_cihi90),
                  width=0, lwd = 1, position = position_dodge(width = 0.5)) +
    geom_errorbar(aes(ymin=min_cilo, ymax=max_cihi),
                  width=0, position = position_dodge(width = 0.5)) +
    geom_point(aes(y=naive),
               position = position_dodge(width = 0.5),
               size = 2) +
    facet_wrap(~ outcome,nrow=1) +
    scale_x_discrete(labels = unique(score_vsent_hard_wt$bin)) + 
    xlab("Relative Slant of News Consumption (Hard News Only)") +
    ylab("Average Treatment Effect of Partisan\nMedia vs. Entertainment (Weighted)") +
    scale_y_continuous(breaks=seq(-0.2,0.2,0.1),
                       labels=plot_labels()$att,
                       limits = c(-0.225, 0.225),
                       sec.axis = dup_axis(name="",
                                           breaks=seq(-0.2,0.2,0.1),
                                           labels = plot_labels()$share)) +
    scale_colour_manual("Comparison",values=c(red_mit, blue_mit)) +
    scale_shape_manual("Comparison",values=c(16, 17, 15)) +
    theme(axis.text.x = element_text(size = 10, hjust = 0.5, color = "black")))

ggsave(q5_b, path = plot_path, filename = "fig_q5_b.pdf", 
       dpi = 600, width=9, height=5.25)

# > Stated media preferences ----

# Construct labels for plotting
med_pref_labs <- paste0("Prefer\n", c("MSNBC", "Entertainment", "Fox"))

# Estimate stated preference results, incorporating sampling weights
pref_vsent_fx_wt <- group_vsent_plot(var = "med_pref_num", nbins = 3, 
                                     labels = med_pref_labs, weights = TRUE)

# Figure Q6: persuasion results by stated media preferences, incorporating sampling weights
(q6 <- ggplot(na.omit(pref_vsent_fx_wt %>% filter(id != "Fox vs.\nMSNBC")), 
                                    aes(x=factor(val),
                                        col = factor(id, levels = c("Fox vs.\nEntertainment",
                                                                    "MSNBC vs.\nEntertainment")),
                                        shape = factor(id, levels = c("Fox vs.\nEntertainment", 
                                                                      "MSNBC vs.\nEntertainment")))) +
    geom_hline(yintercept=0, col = "white") +
    geom_hline(yintercept=0, linetype="dashed", col = grey_dark) +
    geom_errorbar(aes(ymin=min_cilo90, ymax=max_cihi90),
                  width=0, lwd = 1, position = position_dodge(width = 0.5)) +
    geom_errorbar(aes(ymin=min_cilo, ymax=max_cihi),
                  width=0, position = position_dodge(width = 0.5)) +
    geom_point(aes(y=naive),
               position = position_dodge(width = 0.5),
               size = 2) +
    facet_wrap(~ outcome,nrow=1) +
    scale_x_discrete(labels = unique(pref_vsent_fx_wt$bin)) + 
    xlab("Stated Media Preference") +
    ylab("Average Treatment Effect of Partisan\nMedia vs. Entertainment (Weighted)") +
    scale_y_continuous(breaks=seq(-0.2,0.2,0.1),
                       labels=plot_labels()$att,
                       limits = c(-0.225, 0.225),
                       sec.axis = dup_axis(name="",
                                           breaks=seq(-0.2,0.2,0.1),
                                           labels = plot_labels()$share)) +
    scale_colour_manual("Comparison",values=c(red_mit, blue_mit)) +
    scale_shape_manual("Comparison",values=c(16, 17, 15)) +
    theme(axis.text.x = element_text(size = 10, hjust = 0.5, color = "black")))

ggsave(q6, path = plot_path, filename = "fig_q6.pdf", 
       dpi = 600, width=9, height=5.25)
